options(scipen = 999)

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
#BiocManager::install("recount3")
#BiocManager::install("recount")
#BiocManager::install("sva")

library("recount3")
library("recount")
library("data.table")
library(Biobase)
library(readr)
library(rtracklayer)
library(biomaRt)
library(edgeR) ## For TMM
library(sva) ## For batch correction
library(dplyr)


##### Download data and create the RSE object
GTEx_thyroid = recount3::create_rse_manual(
  project = "THYROID",
  project_home = "data_sources/gtex",
  organism = "human",
  annotation = "gencode_v26",
  type = "gene"
)
save(GTEx_thyroid,file = "/home/esaha/BONOBO/data/GTEx_thyroid/GTEx_thyroid_RSE.RData")

# GTEx raw counts
assays(GTEx_thyroid)$counts <- recount3::transform_counts(GTEx_thyroid)
# Convert raw counts into TPM
assays(GTEx_thyroid)$TPM <- recount::getTPM(GTEx_thyroid)

##### Remove Duplicate data: sample for same individual, different vial
# keep only the sample with greater sequencing depth (higher total count)

GTEx_tpm = assays(GTEx_thyroid)$TPM
recount3_id = gsub("\\..*", "", colnames(GTEx_tpm))
recount3_id = unlist(lapply(strsplit(recount3_id, split = "-"), function(x){paste(x, collapse = ".")}))

# Get GTEx portal data
gtex.portal_thyroid <- read.csv("/home/ubuntu/gtex.portal_thyroid.txt", sep="")
portal_id = colnames(gtex.portal_thyroid)
# Keep only samples present in portal
GTEx_tpm = GTEx_tpm[,which(recount3_id %in% portal_id)]
# Keep the sample with maximum depth set and remove others
remove_duplicate = function(expression_matrix, unique_ids){
  removed_samples = c()
  for (i in 1:length(unique(unique_ids))){
    id = unique(unique_ids)[i]
    id_loc = which(unique_ids == id)
    if (length(id_loc) > 1){
      subdata = expression_matrix[,id_loc]
      if (ncol(subdata) > 1){
        seq_depth = colSums(subdata)
        removed_samples = c(removed_samples, colnames(subdata)[-which.max(seq_depth)])
      }
    }
  }
  expression_matrix = expression_matrix[,-which(colnames(expression_matrix) %in% removed_samples)]
  return(expression_matrix)
}
# get donor & tissue ids of GTEx:
GTEx_ID = unlist(lapply(strsplit(colnames(GTEx_tpm), split = "-"), function(x){x[2]}))
# Samples with duplicates
which(table(GTEx_ID) > 1)
# No duplicate samples present after removing samples not present in portal
# GTEx_tpm = remove_duplicate(GTEx_tpm, GTEx_ID)

##### Remove genes with counts < 1 TPM in at least 10% samples
expression = GTEx_tpm
expression_cutoff = 1 # 1 TPM
percent_cutoff = 0.1
minSamples = percent_cutoff*ncol(expression) # at least 20% of samples
keep = rowSums(expression > expression_cutoff) >= minSamples
table(keep)
expression_filtered = expression[keep,]
paste0(nrow(expression), ":Total number of genes overlapped between TCGA and GTEx")
paste0(nrow(expression_filtered), ":Number of genes after filtering ", expression_cutoff," cpm in ", minSamples, " samples")

# Convert to log2(1+TPM)
expression_filtered = log2(expression_filtered+1)

# Separate Male and Female Expression
male = rownames(GTEx_thyroid@colData)[which(GTEx_thyroid@colData$gtex.sex == "1")]
female = rownames(GTEx_thyroid@colData)[which(GTEx_thyroid@colData$gtex.sex == "2")]

GTEx.male = expression_filtered[,colnames(expression_filtered) %in% male]
GTEx.female = expression_filtered[,colnames(expression_filtered) %in% female]

#######################
# Check sex-misannotation using Y gene expression PCA
# Get chromosome location
fdata.GTEx = as.data.frame(GTEx_thyroid@rowRanges)
### Get GenecIDs for Y genes & XIST
Y_genes.GTEx = fdata.GTEx$gene_id[which(fdata.GTEx$seqnames == "chrY")]
# PCA for detecting sex misannotation:
library(ggplot2)
Y_expr = expression_filtered[rownames(expression_filtered) %in% Y_genes.GTEx, ]
gender = as.character(GTEx_thyroid@colData$gtex.sex)
gender[which(gender == 1)] = "MALE"
gender[which(gender == 2)] = "FEMALE"
gender = gender[match(colnames(Y_expr), rownames(GTEx_thyroid@colData))]
gender = factor(gender)
pca.GTEx <- prcomp(t(Y_expr))
U <- pca.GTEx$x
theme_set(bigstatsr::theme_bigstatsr(0.8))
qplot(U[, 1], U[, 2], colour = gender) + coord_equal() + 
  ggtitle("PCA of Y genes") + xlab("PC 1") + ylab ("PC 2")


##### Set Y gene expression to "NA" in female samples

GTEx.female[rownames(GTEx.female) %in% Y_genes.GTEx,] = 0

# Remove version numbers (after ".") from gene names
gene = rownames(expression_filtered) = gsub("\\..*","",rownames(expression_filtered)) 

GTEx.all = cbind(gene, GTEx.female, GTEx.male)

GTEx.male = cbind(gene, GTEx.male)
GTEx.female = cbind(gene, GTEx.female)

write.table(GTEx.male,file = "/home/esaha/BONOBO/data/GTEx_thyroid/GTEx_thyroid_male.txt", row.names = F, col.names = T, sep = '\t', quote = F)
write.table(GTEx.female,file = "/home/esaha/BONOBO/data/GTEx_thyroid/GTEx_thyroid_female.txt", row.names = F, col.names = T, sep = '\t', quote = F)
write.table(GTEx.all,file = "/home/esaha/BONOBO/data/GTEx_thyroid/GTEx_thyroid_allSex.txt", row.names = F, col.names = T, sep = '\t', quote = F)

###################################
# Create phenotypic data matrix
library(fgsea)
library(readr)
library(biomaRt)
library(edgeR)

# Thyroid sample_indices
GTEx_thyroid = get(load("/home/esaha/BONOBO/data/GTEx_thyroid/GTEx_thyroid_RSE.RData"))

# Get sample attributes
pdata.GTEx = as.data.frame(GTEx_thyroid@colData)
pdata.GTEx$gtex.subjid

# Subject Phenotypes
pd = read_tsv("/home/ubuntu/GTEx.v8.phenotypes.txt",skip=10)
pd = data.frame(pd)
pd = pd[match(pdata.GTEx$gtex.subjid, pd$SUBJID),]

pheno_all = cbind(pd, pdata.GTEx)
rownames(pheno_all) = rownames(pdata.GTEx)

write.table(pheno_all, file = "/home/esaha/BONOBO/data/GTEx_thyroid/thyroid_phenotypes.txt", col.names = T, row.names = T)

